Stat 115 Lab 5

RNA-seq

Yushi Tang

February 26/28, 2019

Outline

Announcements

RNA-seq analysis workflow

Wang, Z., Gerstein, M., & Snyder, M. (2009). RNA-Seq: a revolutionary tool for transcriptomics. Nature reviews genetics, 10(1), 57.

Wang, Z., Gerstein, M., & Snyder, M. (2009). RNA-Seq: a revolutionary tool for transcriptomics. Nature reviews genetics, 10(1), 57.

Fragment alignment

Star alignment

#!/bin/bash
#SBATCH --nodes=1                # Run on a single node
#SBATCH --ntasks=10              # Cores requested (matches --runThreadN below)
#SBATCH --time=180               # Runtime limit in minutes (0~10080)
#SBATCH --partition=general      # Partition to submit to
#SBATCH --mem=50000              # Total memory (varies across nodes)
#SBATCH --output=star_%j.out     # Standard out goes to this file
#SBATCH --error=star_%j.err      # Standard err goes to this file
#SBATCH --mail-type=END          # Email when the job ends
#SBATCH --mail-user=YOUR_EMAIL

module load gcc/4.8.2-fasrc01 STAR/2.5.0c-fasrc02

# Align the paired FASTQ files against the STAR genome index.
# $GENOME, $FASTQ1, $FASTQ2 and $OUTDIR are not defined in this script;
# set them (or substitute real paths) before submitting.
STAR --runThreadN 10 \
     --genomeDir $GENOME \
     --readFilesIn $FASTQ1 $FASTQ2 \
     --alignEndsType EndToEnd \
     --outSAMprimaryFlag AllBestScore \
     --outSAMtype BAM SortedByCoordinate \
     --outFileNamePrefix $OUTDIR/
# Shell command that submits the job script above (presumably saved as STARalignment.sh):
sbatch STARalignment.sh

Salmon tutorial

Salmon tutorial 1

Salmon tutorial 1

How to train your Salmon

Salmon Tutorial

Salmon Tutorial

Salmon tutorial

Salmon tutorial 2

Salmon tutorial 2

Salmon tutorial

Salmon tutorial 3

Salmon tutorial 3

Salmon tutorial

Salmon tutorial 4

Salmon tutorial 4

Salmon tutorial

Salmon tutorial 5

Salmon tutorial 5

Salmon tutorial

Salmon tutorial 6

Salmon tutorial 6

Salmon tutorial

Salmon tutorial 7

Salmon tutorial 7

Salmon tutorial

Salmon tutorial 8

Salmon tutorial 8

Salmon tutorial

Salmon tutorial 9

Salmon tutorial 9

Salmon tutorial

Salmon tutorial 10

Salmon tutorial 10

Salmon tutorial

Salmon tutorial 11

Salmon tutorial 11

Salmon tutorial

Enjoy your Salmon

Enjoy your Salmon

Salmon alignment

#!/bin/bash
#SBATCH -N 1                   # Number of nodes
#SBATCH -n 10                  # Number of cores
#SBATCH -t 180                 # Runtime in minutes (0~10080)
#SBATCH -p general             # Partition
#SBATCH --mem=50000            # Total memory (varies across nodes)
#SBATCH -o salmon_%j.out       # Standard out goes to this file
#SBATCH -e salmon_%j.err       # Standard err goes to this file
#SBATCH --mail-type=END        # Email
#SBATCH --mail-user=YOUR_EMAIL

module load salmon

# Build the Salmon index from the transcriptome FASTA.
# $TRANSCRIPTOME and $INDEX are not defined in this script; set them (or
# substitute real paths) before submitting. Quoting keeps paths with spaces
# intact, and -p 10 lets indexing use the 10 cores requested above.
salmon index -t "$TRANSCRIPTOME" -i "$INDEX" -p 10
# Shell command that submits the job script above (presumably saved as createSalmonIndex.sh):
sbatch createSalmonIndex.sh
#!/bin/bash
#SBATCH -N 1                   # Number of nodes
#SBATCH -n 10                  # Number of cores
#SBATCH -t 180                 # Runtime in minutes (0~10080)
#SBATCH -p general             # Partition
#SBATCH --mem=50000            # Total memory (varies across nodes)
#SBATCH -o salmon_%j.out       # Standard out goes to this file
#SBATCH -e salmon_%j.err       # Standard err goes to this file
#SBATCH --mail-type=END        # Email
#SBATCH --mail-user=YOUR_EMAIL

module load salmon

# Quantify the paired-end reads against the Salmon index.
# $INDEX, $FASTQ and $OUT are not defined in this script; set them (or
# substitute real paths) before submitting.
#   -l A              let Salmon infer the library type
#   --numBootstraps   number of bootstrap samples to draw
#   -p 10             threads; matches the 10 cores requested above
#   --gcBias          enable fragment GC-bias correction
salmon quant -i "$INDEX" \
             -l A \
             -1 "$FASTQ/ENCFF500PDO_sub.fastq" \
             -2 "$FASTQ/ENCFF708KQE_sub.fastq" \
             -o "$OUT" \
             --numBootstraps 100 \
             -p 10 \
             --gcBias
# Shell command that submits the job script above (presumably saved as Salmonalignment.sh):
sbatch Salmonalignment.sh

Running your own alignment

Differential expression

# Install required packages
# biocLite() has been retired by Bioconductor; BiocManager is the supported
# installer. BiocManager::install() also offers to update out-of-date
# packages, which covers the old biocLite("BiocUpgrade") step.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install(c(
  "DESeq2",
  "tximport",
  "EnsDb.Hsapiens.v86",
  "EnsDb.Mmusculus.v79"
))
# CRAN packages: rjson, plus gplots, which the heatmap.2() step of this lab loads.
install.packages(c("rjson", "gplots"))

Differential expression

library(DESeq2)
# Quantification files in Data/: every entry whose name contains "sf".
# list.files() returns them sorted, and `condition` / `names` below are
# matched to them purely by position.
files <- list.files("Data", pattern = "sf")
condition <- c("4oh", "4oh", "4oh", "ctrl", "ctrl", "ctrl")
names <- c("4oh1", "4oh2", "4oh3", "ctrl1", "ctrl2", "ctrl3")
# Fail loudly on a file-count mismatch; data.frame() would otherwise recycle
# the shorter vector whenever the lengths happen to divide evenly.
stopifnot(
  length(files) == length(condition),
  length(files) == length(names)
)
# One row per sample: readable sample label, source file, experimental condition.
sampleTable <- data.frame(sampleName = names, fileName = files, condition = condition)

Differential expression

library(EnsDb.Mmusculus.v79)
# Fetch the transcript annotation from the mouse EnsDb as a DataFrame.
txdf <- transcripts(
  EnsDb.Mmusculus.v79,
  return.type = "DataFrame"
)
# Keep only the transcript ID -> gene ID columns, as a plain data.frame,
# for use as the tx2gene mapping in tximport().
tx2gene <- as.data.frame(txdf[, c("tx_id", "gene_id")])

Differential expression

library(tximport)
# Import the Salmon quantifications and summarise them to gene level via
# tx2gene; ignoreTxVersion makes the ID match ignore transcript version suffixes.
txi <- tximport(
  file.path("Data", files),
  type = "salmon",
  tx2gene = tx2gene,
  ignoreTxVersion = TRUE
)
# Build the DESeq2 data set with condition as the only design factor.
dds <- DESeqDataSetFromTximport(txi, colData = sampleTable, design = ~ condition)
# Drop genes whose total count across all samples is 0 or 1, then fit the model.
keep <- rowSums(counts(dds)) > 1
dds <- DESeq(dds[keep, ])

Differential expression

# Extract the results table, with alpha = 0.05 as the significance cutoff.
# NOTE(review): DESeq2 treats the first factor level as the reference; with
# levels "4oh" and "ctrl" that is presumably "4oh", so log2FoldChange would be
# ctrl relative to 4oh. Confirm this is the intended direction.
res <- results(dds, alpha = 0.05)
# Remove rows containing any NA, then sort by adjusted p-value (smallest first).
res <- res[complete.cases(res), ]
res <- res[order(res$padj), ]
# Split the significant genes by the sign of their fold change.
sig <- res$padj < 0.05
upR <- res[sig & (res$log2FoldChange > 0), ]
downR <- res[sig & (res$log2FoldChange < 0), ]
nrow(upR)
## [1] 382
nrow(downR)
## [1] 518

Visualizing results

# MA plot of the DESeq2 results.
plotMA(res)

# Pool the significant up- and down-regulated genes and rank them by absolute
# log2 fold change, largest first.
absOrdered <- rbind(upR, downR)
absOrdered <- absOrdered[order(abs(absOrdered$log2FoldChange), decreasing = TRUE), ]
# log2 of the tximport abundances for those genes; the small pseudocount keeps
# zeros finite, and drop = FALSE keeps a matrix even when only one gene is left.
mostvariable <- log2(txi$abundance[row.names(absOrdered), , drop = FALSE] + .0001)

library(gplots)
# head() takes at most the top 100 genes, so this no longer fails with
# "subscript out of bounds" when fewer than 100 genes are significant.
heatmap.2(head(mostvariable, 100), trace = "none", col = greenred(10))

Odyssey Fest

Odyssey Fest

#!/bin/bash
#SBATCH -N 1                   # Number of nodes
#SBATCH -n 10                  # Number of cores
#SBATCH -t 240                 # Runtime in minutes (0~10080)
#SBATCH -p general,serial_requeue,shared  # Partition
#SBATCH --mem=50000            # Total memory (varies across nodes)
#SBATCH -o salmon_%j.out         # Standard out goes to this file
#SBATCH -e salmon_%j.err         # Standard err goes to this file
#SBATCH --mail-type=END        # Email
#SBATCH --mail-user=ytang@hsph.harvard.edu

module load salmon

# Build the Salmon index from the transcriptome FASTA.
# $TRANSCRIPTOME and $INDEX are not defined in this script; set them (or
# substitute real paths) before submitting. Quoting keeps paths with spaces
# intact, and -p 10 lets indexing use the 10 cores requested above.
salmon index -t "$TRANSCRIPTOME" -i "$INDEX" -p 10

Basic Operations Reminder (A Small Cheat Sheet)

Basic steps to access the cluster

Basic steps to access the cluster

Useful commands for data management

Useful commands for data management

Cluster Computing Reminder (A Small Cheat Sheet)

Inquire path on the Odyssey

Inquire path on the Odyssey

Upload scripts to the Odyssey

Upload scripts to the Odyssey

View script list

View script list

View specific script

View specific script

View specific script

View specific script

Manage current jobs

Manage current jobs

Download the output

Download the output

Good Luck!